In [1]:
import numpy as np
import pandas as pd
In [6]:
def transform(points_x, k=1):
    """Map 2-D inputs (x1, x2) into an 8-term nonlinear feature space.

    Output columns, in order:
        1, x1, x2, x1^2, x2^2, x1*x2, |x1 - x2|, |x1 + x2|

    Parameters
    ----------
    points_x : ndarray with at least two columns; only columns 0 and 1 are read.
    k : not used by the body; kept so existing calls remain valid.

    Returns
    -------
    ndarray of shape (n, 8), where n is the number of rows in ``points_x``.
    """
    x1, x2 = points_x[:, 0], points_x[:, 1]

    # One entry per output column; the scalar 1 broadcasts to the intercept.
    columns = (
        1,
        x1,
        x2,
        np.power(x1, 2),
        np.power(x2, 2),
        x1 * x2,
        np.abs(x1 - x2),
        np.abs(x1 + x2),
    )

    features = np.zeros([points_x.shape[0], len(columns)])
    for col_index, col_values in enumerate(columns):
        features[:, col_index] = col_values
    return features
def solve_linear_regression(pi, labels):
    """Return the least-squares weights w that minimise ||pi . w - labels||.

    Parameters
    ----------
    pi : 2-D array, one row per sample and one column per feature.
    labels : 1-D array of targets, one per row of ``pi``.

    Returns
    -------
    1-D array of weights, one per column of ``pi``.

    The pseudo-inverse is taken of ``pi`` itself rather than of ``pi.T . pi``.
    Both forms are mathematically equal, but forming ``pi.T . pi`` squares the
    condition number, so small-but-valid singular values can fall below
    pinv's cutoff and be silently zeroed.
    """
    return np.dot(np.linalg.pinv(pi), labels)
def get_linear_regression_error(pi, labels, gx_vector):
    """Return the fraction of samples whose predicted sign opposes the label.

    Parameters
    ----------
    pi : 2-D array, one row per sample.
    labels : 1-D array of targets, one per row of ``pi``.
    gx_vector : 1-D weight vector, one entry per column of ``pi``.

    Returns
    -------
    float in [0, 1].

    Raises
    ------
    ZeroDivisionError if ``labels`` is empty.

    A sample counts as an error only when ``label * sign(score)`` is strictly
    negative, so a score of exactly 0 is not counted as a mistake.
    """
    predictions = np.sign(np.dot(pi, gx_vector))
    # Vectorised count instead of the builtin sum(), which walks the array
    # one element at a time in Python.
    misclassified = np.count_nonzero((labels * predictions) < 0)
    return misclassified / len(labels)
In [7]:
# Load the whitespace-separated training and test sets as plain NumPy arrays.
# `sep=r"\s+"` replaces the deprecated `delim_whitespace=True`, and `.values`
# replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
train = pd.read_csv("./data/in.dta", sep=r"\s+", header=None).values
test = pd.read_csv("./data/out.dta", sep=r"\s+", header=None).values
In [8]:
# Columns 0-1 are taken as the input features, column 2 as the target.
train_x, train_y = train[:, :2], train[:, 2]
test_x, test_y = test[:, :2], test[:, 2]
In [16]:
# Lift both input sets into the feature space built by transform().
train_z, test_z = transform(train_x), transform(test_x)
In [67]:
def argmin(z):
    """Return the first key of mapping ``z`` whose value is the minimum.

    Returns None when ``z`` is empty. Ties go to the key that comes first
    in the mapping's iteration order.
    """
    if not z:
        return None
    lowest = min(z.values())
    winners = [key for key, value in z.items() if value == lowest]
    return winners[0]
In [87]:
def cross_validate_linear_regression(t_size):
    """Compare feature-subset models by hold-out validation and print the results.

    Reads the notebook-level arrays ``train_z``, ``train_y``, ``test_z`` and
    ``test_y``. The first ``t_size`` rows of the training data are used to
    fit; the remaining rows are the validation set. For each k in 3..7 the
    model uses feature columns 0..k.

    Parameters
    ----------
    t_size : number of leading training rows used for fitting.

    Returns
    -------
    None; the per-k errors and the two best-k summaries are printed.
    """
    error_val = dict()
    error_out = dict()
    for k in range(3, 8):
        # Linear regression without decay, fitted on the first t_size rows.
        gx_vector = solve_linear_regression(train_z[:t_size, :k+1], train_y[:t_size])
        # Validation error: measured on the held-out tail of the training data.
        error_val[k] = get_linear_regression_error(train_z[t_size:, :k+1], train_y[t_size:], gx_vector)
        # Out-of-sample error: measured on the separate test set.
        error_out[k] = get_linear_regression_error(test_z[:, :k+1], test_y, gx_vector)
        print("k = %d\t error_val = %.3f\terror_out = %.3f" % (k, error_val[k], error_out[k]))

    # Report the errors of the selected model: index by best_k, not by the
    # loop variable k, which is left at its final value after the loop.
    best_k = argmin(error_val)
    print("\nbest validation k = %d\terror_val = %.3f\t error_out = %.3f" % (best_k, error_val[best_k], error_out[best_k]))
    best_k = argmin(error_out)
    print("\nbest out-of-sample k = %d\terror_val = %.3f\t error_out = %.3f" % (best_k, error_val[best_k], error_out[best_k]))
In [88]:
# Fit on the first 25 training rows; validate on the remaining rows.
cross_validate_linear_regression(t_size=25)
In [90]:
# Fit on the first 10 training rows; validate on the remaining rows.
cross_validate_linear_regression(t_size=10)
In [103]:
# Monte Carlo estimate of E[e1], E[e2] and E[min(e1, e2)] for two independent
# draws from np.random's uniform [0, 1) generator.
iterations = 10000
rng = np.random.RandomState(0)  # fixed seed so a re-run prints the same numbers

# Draw every sample at once instead of looping in Python.
e1 = rng.rand(iterations)
e2 = rng.rand(iterations)

tot_e1 = e1.sum()
tot_e2 = e2.sum()
tot_e = np.minimum(e1, e2).sum()  # element-wise minimum of each (e1, e2) pair

print(tot_e1 / iterations)
print(tot_e2 / iterations)
print(tot_e / iterations)